<?php

namespace App;

/**
 * Scrapes a Stack Overflow question page (question body, tags, answers and
 * their comments) through a rotating pool of HTTP(S) proxies and stores the
 * result through the injected database handle.
 *
 * Collaborators are only known here through the calls made on them:
 *  - $cache: exists(), scard(), srandmember(), srem(), sadd(), smembers(),
 *    lpush() (presumably a Redis client -- confirm against the caller).
 *  - $db: getValue(string $sql) and add(string $table, array $row).
 *  - client() and clean_html() are helper functions defined outside this file;
 *    \Log::out() is the project logger.
 */
final class Pickup
{
    /** Page-title marker of Stack Exchange's rate-limit response. */
    private const RATE_LIMIT_MARKER = '<title>Too Many Requests - Stack Exchange</title>';

    /** Page-title marker of Stack Exchange's maintenance page. */
    private const OFFLINE_MARKER = '<title>We are Offline</title>';

    /** Shape a pool entry must have to be handed out by randProxy(). */
    private const PROXY_PATTERN = '#http(s?)\://\d+\.\d+\.\d+\.\d+\:\d+#';

    /** Characters accepted in a tag name before it is embedded in SQL. */
    private const TAG_PATTERN = '/\A[a-z0-9+#._\-]+\z/i';

    /** Upper bound on draws from the pool in a single randProxy() call. */
    private const MAX_PROXY_ATTEMPTS = 10;

    /** Number of re-fetches pick() performs before giving up on a URL. */
    private const MAX_FETCH_RETRIES = 5;

    /** Pause (microseconds, i.e. 30 s) applied when the target reports it is offline. */
    private const OFFLINE_BACKOFF_US = 30000000;

    private $cache;
    private $db;

    // NOTE(review): these are hard-coded browser cookies, including what looks
    // like an account/session token ('acct'). Consider moving them to
    // configuration and rotating the token -- confirm with the owner.
    private $cookies = [
        '__gads' => 'ID=9c432e6ebd6c09f5:T=1567396388:S=ALNI_MZpa6xcLdXjU36UEti3DPm3CuHt_A',
        '__qca'  => 'P0-1756034443-1567396389201',
        '_fbp'   => "fb.1.1567474216582.245653868",
        '_ga'    => 'GA1.2.181918219.1567396387',
        '_gid'   => 'GA1.2.328171336.1567647572',
        'acct'   => 't=SqVoRpL3dQ8zuK%2bXuRi0kTL26On0b%2bvE&s=CRPqHEqTV02J%2fAZd1evAOGPDynQI6UwH',
        'prov'   => '01cb55ee-8d06-f1f5-0d6e-028507545dfa',
    ];

    /** Request headers sent with every client() call. */
    private $headers = [
        'Connection'                => 'keep-alive',
        'Cache-Control'             => 'max-age=0',
        'Upgrade-Insecure-Requests' => '1',
        'sec-fetch-mode'            => 'navigate',
        'sec-fetch-site'            => 'none',
        'sec-fetch-user'            => '?1',
        'Referer'                   => 'https://stackoverflow.com/questions',
        'User-Agent'                => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:29.0) Gecko/20100101 Firefox/29.0',
        'Accept'                    => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding'           => 'gzip, deflate, br',
    ];

    /** User-agent strings grouped by device family; read by randUserAgent(). */
    public $userAgentArray = [
        'pc'      => [
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            'Mozilla/5.0 (X11; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:29.0) Gecko/20100101 Firefox/29.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
        ],
        'android' => [
            'Mozilla/5.0 (Android; Mobile; rv:29.0) Gecko/29.0 Firefox/29.0',
            'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36',
        ],
        'ios'     => [
            'Mozilla/5.0 (iPad; CPU OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) CriOS/34.0.1847.18 Mobile/11B554a Safari/9537.53',
            'Mozilla/5.0 (iPad; CPU OS 7_0_4 like Mac OS X) AppleWebKit/537.51.1 (KHTML, like Gecko) Version/7.0 Mobile/11B554a Safari/9537.53',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0_2 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12A366 Safari/600.1.4',
        ],
    ];

    /**
     * @param mixed $cache Set/list store holding the "proxy" set and the "error" list.
     * @param mixed $db    Database handle exposing getValue() and add().
     */
    public function __construct($cache, $db)
    {
        $this->cache = $cache;
        $this->db    = $db;
    }

    /**
     * Returns a random, well-formed proxy URL from the "proxy" set.
     *
     * The pool is refilled first when the set is missing or smaller than a
     * random threshold between 4 and 8. Malformed entries that are drawn are
     * removed from the set so they cannot be drawn again.
     *
     * @return string Proxy URL such as "http://1.2.3.4:8080".
     *
     * @throws \RuntimeException When no usable entry could be drawn. The
     *                           previous implementation recursed without bound
     *                           in that situation.
     */
    public function randProxy()
    {
        if (!$this->cache->exists('proxy') || $this->cache->scard('proxy') < rand(4, 8)) {
            $this->getProxy();
        }

        for ($attempt = 0; $attempt < self::MAX_PROXY_ATTEMPTS; $attempt++) {
            $candidate = $this->cache->srandmember('proxy');
            if (!is_string($candidate) || $candidate === '') {
                // Nothing to draw even after the refill above: stop retrying.
                break;
            }
            if (preg_match(self::PROXY_PATTERN, $candidate)) {
                return $candidate;
            }
            // Malformed entry: drop it so the next draw makes progress.
            $this->cache->srem('proxy', $candidate);
        }

        throw new \RuntimeException('No usable proxy available in the "proxy" pool');
    }

    /**
     * Scrapes one question URL and stores the question, its tags, answers and
     * comments. Failures are logged and the URL is pushed onto the "error"
     * list; no exception escapes for \Exception subclasses.
     *
     * @param int    $tasker Worker identifier, used only in log messages.
     * @param string $url    Question URL; must contain a "/<digits>/" segment.
     *
     * @return true|null true when the question is already stored, null otherwise.
     */
    public function pick(int $tasker, string $url)
    {
        $qaid = $this->extractQuestionId($url);
        if ($qaid === null) {
            // Without an id the comments URL and the `sfid` column cannot be
            // built; retrying would never succeed, so the URL is not re-queued.
            \Log::out('launch', "[fail-{$tasker}]:no question id in {$url}");
            return null;
        }
        // $qaid is digits-only (see extractQuestionId), so embedding it is safe.
        if ($this->db->getValue("select count(*) from `qa_question` where `sfid`='{$qaid}'") > 0) {
            return true;
        }

        $proxy = null;
        try {
            $proxy   = $this->randProxy();
            $html    = client($url, null, $proxy, $this->cookies, $this->headers);
            $retries = 0;
            while (empty($html) || $this->contains($html, self::RATE_LIMIT_MARKER)) {
                if (empty($html)) {
                    // No response at all: the proxy may be dead.
                    $this->checkProxy($proxy);
                } else {
                    // Rate-limited through this proxy: retire it.
                    $this->cache->srem('proxy', $proxy);
                }
                if (++$retries > self::MAX_FETCH_RETRIES) {
                    $html = '';
                    break;
                }
                $proxy = $this->randProxy();
                $html  = client($url, null, $proxy, $this->cookies, $this->headers);
            }
            if (empty($html)) {
                throw new \Exception("未采集到数据 in {$url}");
            }
            if ($this->contains($html, self::OFFLINE_MARKER)) {
                usleep(self::OFFLINE_BACKOFF_US);
                throw new \Exception("目标已下线 in {$url}");
            }

            $html     = (string) clean_html($html);
            $question = $this->parseQuestion($html);
            if (empty($question['title'])) {
                throw new \Exception("May be occur error in {$url}");
            }

            $questionComment = client(
                'https://stackoverflow.com/posts/' . $qaid . '/comments',
                null,
                $proxy,
                $this->cookies,
                $this->headers
            );
            $answers = $this->collectAnswers($html, $proxy);
            $tagIds  = $this->resolveTagIds($question['tags']);

            $this->store($qaid, $question, $questionComment, $tagIds, $answers);
            \Log::out('launch', "[got-{$tasker}]:" . $url . " [proxy]:" . $proxy);
        } catch (\Exception $e) {
            \Log::out('launch', "[fail-{$tasker}]:" . $e->getMessage());
            $this->cache->lpush("error", $url);
            // Only probe the proxy when one was actually obtained.
            if ($proxy !== null && preg_match('#cURL#', $e->getMessage())) {
                $this->checkProxy($proxy);
            }
        }

        return null;
    }

    /**
     * Refills the proxy pool from every configured listing page.
     */
    public function getProxy()
    {
        $this->getProxies1('http://www.iphai.com/free/ng');
        $this->getProxies1('http://www.iphai.com/free/wg');
        $this->getProxies1('http://www.iphai.com/free/wp');
        $this->getProxies4();
    }

    /**
     * Removes $proxy from the pool when a TCP connection to it cannot be
     * opened within 5 seconds (or when it cannot be parsed into host and port).
     *
     * @param string $proxy Proxy URL as stored in the "proxy" set.
     */
    public function checkProxy($proxy)
    {
        if (!$this->proxyIsAlive($proxy)) {
            \Log::out('launch', "[remove proxy]:" . $proxy);
            $this->cache->srem('proxy', $proxy);
        }
    }

    /**
     * Probes every pool member and removes the ones that are not reachable.
     *
     * @return bool Always true.
     */
    public function checkAllProxy(): bool
    {
        $members = $this->cache->smembers('proxy');
        foreach ($members ?: [] as $member) {
            if (!$this->proxyIsAlive($member)) {
                $this->cache->srem('proxy', $member);
            }
        }

        return true;
    }

    /**
     * Harvests proxies from a listing page whose table carries the classes
     * "table table-bordered table-striped table-hover". Cell 0 is read as the
     * host, cell 1 as the port and cell 3 as the scheme.
     *
     * @param string $baseuri URL of the listing page.
     */
    public function getProxies1($baseuri)
    {
        $this->harvestProxies(
            $baseuri,
            '#<table.*?class="table table-bordered table-striped table-hover">.*?</table>#is',
            '#<tr[^>]*>\s*(<td>.*?</td>)\s*</tr>#is',
            0,
            1,
            3
        );
    }

    /**
     * Harvests proxies from the xicidaili listing (table id "ip_list").
     * Cell 1 is read as the host, cell 2 as the port and cell 5 as the scheme.
     */
    public function getProxies4()
    {
        $this->harvestProxies(
            'https://www.xicidaili.com/nt/',
            '#<table.*?id="ip_list">.*?</table>#is',
            '#<tr[^>]*>\s*(<td[^>]*>.*?</td>)\s*</tr>#is',
            1,
            2,
            5
        );
    }

    /**
     * Picks a random user-agent string for a device family and appends a
     * random integer between 0 and 10000 to it.
     *
     * @param string $type 'pc', 'android', 'ios' or 'mobile' (android + ios).
     *                     Any other value is returned unchanged, which lets a
     *                     caller pass a literal user-agent string.
     *
     * @return mixed The generated user-agent string, or $type itself.
     */
    public function randUserAgent($type = 'pc')
    {
        if ($type === 'mobile') {
            $pool = array_merge($this->userAgentArray['android'], $this->userAgentArray['ios']);
        } elseif (in_array($type, ['pc', 'android', 'ios'], true)) {
            $pool = $this->userAgentArray[$type];
        } else {
            return $type;
        }

        return $pool[array_rand($pool)] . rand(0, 10000);
    }

    /**
     * Returns the numeric id found between slashes in $url, or null.
     *
     * @param string $url
     *
     * @return string|null
     */
    private function extractQuestionId($url)
    {
        return preg_match('#/(\d+)/#', $url, $found) === 1 ? $found[1] : null;
    }

    /**
     * Tells whether $haystack is a string containing $needle.
     *
     * @param mixed  $haystack
     * @param string $needle
     *
     * @return bool
     */
    private function contains($haystack, $needle)
    {
        return is_string($haystack) && strpos($haystack, $needle) !== false;
    }

    /**
     * Returns the first capture group of $pattern in $subject, or null when
     * the pattern does not match.
     *
     * @param string $pattern
     * @param string $subject
     *
     * @return string|null
     */
    private function capture($pattern, $subject)
    {
        if (preg_match($pattern, $subject, $found) === 1 && isset($found[1])) {
            return $found[1];
        }

        return null;
    }

    /**
     * Extracts the question fields from the cleaned page HTML.
     *
     * @param string $html Output of clean_html().
     *
     * @return array Keys: title, question, tags (whitespace-separated names),
     *               user, publish_date. Unmatched fields are null ('' for tags).
     */
    private function parseQuestion($html)
    {
        $tagsHtml = $this->capture(
            '#<div\s+id\="mainbar"[^>]*>.*?<div\s+class\="post\-taglist\s[^>]*>(.*?)</div>#is',
            $html
        );

        return [
            'title'        => $this->capture(
                '#<div\s+id\="question-header"\s+class\="grid\s+sm\:fd\-column">.*?<a[^>]*>(.*?)</a>#is',
                $html
            ),
            'question'     => $this->capture(
                '#<div\s+class\="question"[^>]*>.*?<div\s+class\="post\-text"[^>]*>(.*?)</div>#is',
                $html
            ),
            'tags'         => trim(strip_tags((string) $tagsHtml)),
            'user'         => $this->capture(
                '#<div\s+id\="mainbar"[^>]*>.*?<div\s+class\="user\-details"[^>]*>.*?<span[^>]*>(.*?)</span>#is',
                $html
            ),
            'publish_date' => $this->capture(
                '#<div\s+id\="mainbar"[^>]*>.*?<div\s+class\="user\-action\-time"[^>]*>.*?<span\s+title="([^\"]+)"#is',
                $html
            ),
        ];
    }

    /**
     * Extracts every answer from the cleaned page HTML and fetches each
     * answer's comments through $proxy.
     *
     * @param string $html  Output of clean_html().
     * @param string $proxy Proxy URL used for the comment requests.
     *
     * @return array List of arrays with keys answer, user, publish_date, answer_comment.
     */
    private function collectAnswers($html, $proxy)
    {
        $answers = [];
        $pattern = '#<div[^>]*\s+class\="answer[^"]*"\s+data\-answerid\="(\d+)"[^>]*?>(.*?)'
            . '<div\s+class\="js\-post\-notices\s+post\-layout\-\-full">#is';
        if (!preg_match_all($pattern, $html, $found)) {
            return $answers;
        }

        foreach ($found[2] as $position => $answerHtml) {
            $answers[] = [
                // The "is" flags match the question-body pattern so that
                // multi-line answer bodies are captured as well.
                'answer'         => $this->capture(
                    '#<div\s+class\="post\-text"\s+itemprop\="text">(.*?)</div>#is',
                    $answerHtml
                ),
                'user'           => $this->capture(
                    '#<div\s+class\="user\-details"[^>]*>.*?<span[^>]*>(.*?)</span>#is',
                    $answerHtml
                ),
                'publish_date'   => $this->capture(
                    '#<div\s+class\="user\-action\-time"[^>]*>.*?<span\s+title="([^\"]+)"#is',
                    $answerHtml
                ),
                'answer_comment' => client(
                    'https://stackoverflow.com/posts/' . $found[1][$position] . '/comments',
                    null,
                    $proxy,
                    $this->cookies,
                    $this->headers
                ),
            ];
        }

        return $answers;
    }

    /**
     * Maps tag names to `qa_tags` ids, inserting names that are not stored yet.
     *
     * Tag names come from scraped HTML and are embedded in a SQL string, so
     * names with characters outside TAG_PATTERN are skipped and logged.
     * NOTE(review): switch to a parameterised query if the db layer offers one.
     *
     * @param string $tagList Whitespace-separated tag names.
     *
     * @return array List of tag ids.
     */
    private function resolveTagIds($tagList)
    {
        $ids = [];
        if (empty($tagList)) {
            return $ids;
        }

        foreach (preg_split('/\s+/', $tagList) as $tag) {
            if (!preg_match(self::TAG_PATTERN, $tag)) {
                \Log::out('launch', '[skip tag]:' . $tag);
                continue;
            }
            $tagId = $this->db->getValue("select `id` from `qa_tags` where `name`='{$tag}'");
            $ids[] = $tagId ? $tagId : $this->db->add("qa_tags", ['name' => $tag]);
        }

        return $ids;
    }

    /**
     * Inserts the question row followed by one row per answer.
     *
     * @param string $qaid            Stack Overflow question id.
     * @param array  $question        Result of parseQuestion().
     * @param mixed  $questionComment Raw response of the question-comments request.
     * @param array  $tagIds          Result of resolveTagIds().
     * @param array  $answers         Result of collectAnswers().
     */
    private function store($qaid, array $question, $questionComment, array $tagIds, array $answers)
    {
        $questionId = $this->db->add("qa_question", [
            'sfid'         => $qaid,
            'title_raw'    => $question['title'],
            'question_raw' => $question['question'],
            'comment_raw'  => $questionComment,
            'tags'         => implode(',', $tagIds),
            'user'         => $question['user'],
            // Keeps the first 19 characters of the timestamp ("YYYY-MM-DD HH:MM:SS" length).
            'created_at'   => substr((string) $question['publish_date'], 0, 19),
        ]);

        foreach ($answers as $position => $answer) {
            $this->db->add("qa_answer", [
                'question_id' => $questionId,
                'answer_raw'  => $answer['answer'],
                'user'        => $answer['user'],
                // Only the first answer on the page is flagged.
                'flag'        => ($position === 0) ? 1 : 0,
                'comment_raw' => $answer['answer_comment'],
                'created_at'  => substr((string) $answer['publish_date'], 0, 19),
            ]);
        }
    }

    /**
     * Tells whether a TCP connection to the proxy can be opened within 5 seconds.
     * Values that do not parse into a host and a port count as not alive.
     *
     * @param mixed $proxy Proxy URL.
     *
     * @return bool
     */
    private function proxyIsAlive($proxy)
    {
        $parts = is_string($proxy) ? parse_url($proxy) : false;
        if (!is_array($parts) || empty($parts['host']) || empty($parts['port'])) {
            return false;
        }

        return $this->isReachable($parts['host'], $parts['port'], 5);
    }

    /**
     * Opens (and immediately closes) a TCP connection to $host:$port.
     *
     * @param string     $host
     * @param int|string $port
     * @param int        $timeout Connect timeout in seconds.
     *
     * @return bool true when the connection could be opened.
     */
    private function isReachable($host, $port, $timeout)
    {
        $socket = stream_socket_client($host . ':' . $port, $errno, $errstr, $timeout);
        if ($socket === false) {
            return false;
        }
        fclose($socket);

        return true;
    }

    /**
     * Downloads a proxy listing page, extracts the table rows and adds every
     * reachable proxy to the "proxy" set.
     *
     * Every captured row is considered (the previous loops ran from index 1 to
     * sizeof() inclusive, which skipped the first row and read one row past
     * the end). Rows are kept only when the host cell is an IPv4 address, the
     * port cell is numeric and the scheme is http or https (empty -> http);
     * this also discards any header row.
     *
     * @param string $pageUrl      Listing page URL.
     * @param string $tablePattern Regex matching the whole proxy table.
     * @param string $rowPattern   Regex whose group 1 captures one row's cells.
     * @param int    $hostCol      Cell index of the IP address.
     * @param int    $portCol      Cell index of the port.
     * @param int    $schemeCol    Cell index of the protocol name.
     */
    private function harvestProxies($pageUrl, $tablePattern, $rowPattern, $hostCol, $portCol, $schemeCol)
    {
        $raw = file_get_contents($pageUrl);
        if ($raw === false) {
            \Log::out('launch', '[proxy source unreachable]:' . $pageUrl);
            return;
        }

        $page = (string) clean_html($raw);
        if (preg_match($tablePattern, $page, $table) !== 1) {
            return;
        }
        if (!preg_match_all($rowPattern, $table[0], $rows)) {
            return;
        }

        foreach ($rows[1] as $rowHtml) {
            preg_match_all('#<td[^>]*>\s*(.*?)\s*</td>#is', $rowHtml, $cellMatch);
            $cells = $cellMatch[1];

            $host   = trim($cells[$hostCol] ?? '');
            $port   = trim($cells[$portCol] ?? '');
            $scheme = strtolower(trim($cells[$schemeCol] ?? ''));
            if ($scheme === '') {
                $scheme = 'http';
            }

            if (filter_var($host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4) === false
                || !ctype_digit($port)
                || !in_array($scheme, ['http', 'https'], true)
            ) {
                continue;
            }

            \Log::out('launch', "[check proxies]:" . $host);
            if ($this->isReachable($host, $port, 1)) {
                $this->cache->sadd('proxy', $scheme . '://' . $host . ':' . $port);
            }
        }
    }
}

/***
Example usage (disabled; kept for reference only):

go(function () {
    $starton = microtime(true);
    $cache = \RedisPool::get();
    $db    = \MysqlPool::get();
    $puer  = new Pickup($cache, $db);
    $puer->pick(0, 'https://stackoverflow.com/questions/57862617/json-to-html-table-javascript-not-working');
    $time = round(microtime(true) - (float)$starton, 5);
    echo 'Total compute time used: ', $time, '    Total memory used: ', (memory_get_usage(true) / 1024), "kb\n\nDone.\n";
});
***/